Predict loan defaults by leveraging XGBoost. The structure of the notebook is as follows:
import sys
# data manipulation
import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
import numpy as np
# autoreload magic for developing local packages
%load_ext autoreload
%autoreload 2
# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # higher resolution plots
import seaborn as sns
from pandas_profiling import ProfileReport
sns.set_context("poster")
sns.set(rc={'figure.figsize': (8, 5.)})
sns.set_style("whitegrid")
import chart_studio.plotly as py
import plotly.express as px
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb
def read_and_preprocess_file(file_location):
    """Load a loan CSV and clean it for modelling.

    Parameters
    ----------
    file_location : str or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The raw frame with currency columns parsed to float, a derived
        ``insured_minus_loan`` column, ``request_date`` parsed to datetime,
        and ``request_year``/``business_type`` cast to category plus a
        numeric ``request_month`` column.
    """
    data = pd.read_csv(file_location)
    # Strip currency formatting ("$1,234" -> 1234.0). Use an explicit regex
    # removing both '$' and ',' in one pass: the original relied on the
    # default `regex` behaviour of Series.str.replace, which differs across
    # pandas versions ('$' is a regex anchor when regex=True, so the dollar
    # sign would never be removed and astype(float) would fail).
    for col in ("loan_amount", "insured_amount"):
        data[col] = data[col].str.replace(r"[\$,]", "", regex=True).astype(float)
    data["insured_minus_loan"] = data["insured_amount"] - data["loan_amount"]
    # request_date arrives as "dd-Mmm-yy"; rebuild it as "20yy-Mmm-dd" so a
    # single strptime format can parse it. Assumes all years are 20xx.
    dd_mmm_yy = data.request_date.str.split("-", expand=True)
    dd_mmm_yy[2] = "20" + dd_mmm_yy[2]
    data["request_date"] = dd_mmm_yy[2] + "-" + dd_mmm_yy[1] + "-" + dd_mmm_yy[0]
    data["request_date"] = pd.to_datetime(
        data.request_date, format="%Y-%b-%d")
    data['request_year'] = pd.DatetimeIndex(data['request_date']).year
    # Year is treated as categorical (one-hot encoded later), month stays numeric.
    data['request_year'] = data.request_year.astype('category')
    data['business_type'] = data.business_type.astype('category')
    data['request_month'] = pd.DatetimeIndex(data['request_date']).month
    return data
# Run both splits through the same cleaning pipeline.
train_data, test_data = (
    read_and_preprocess_file(f"data/{split}.csv") for split in ("train", "test")
)
train_data.shape, test_data.shape
train_data.head()
This library generates all the relevant EDA plots and correlations for the relevant columns. Given that this is a small dataset,
I took advantage of the library. I have also saved the visualizations to the file visualization_abhinav.html, which is easier to read than the widget below. Do refer to the HTML report as well.
# Automated EDA on every column except the identifier: shown inline as a
# widget and also written out as a standalone HTML report.
profile = ProfileReport(train_data.drop(columns="id"),
                        title="Predict Loan Defaults Report")
profile.to_widgets()
profile.to_file(output_file='visualization_abhinav.html')
I am using the year and month information since both the train and test datasets come from the same time period.
2009 compared to 2010. Could we attribute this to the 2008 financial crisis?
fig, ax = plt.subplots( 1,2 , figsize=(18,5))
# Year panel on the left axis, month counts on the right axis.
sns.countplot(x='request_year', hue='default_status',
              data=train_data, ax=ax[0]).set_title("Loan defaults by year")
# The original omitted `ax` here and relied on the implicit current axes
# (which happened to be ax[1] after plt.subplots); make the target explicit.
sns.countplot(x='request_month', hue='default_status',
              data=train_data, ax=ax[1]).set_title('Loan defaults by month');
# Side-by-side: defaults split by having other loans, and by business type.
fig, ax = plt.subplots(1, 2, figsize=(18, 5))
sns.countplot(x='other_loans', hue='default_status', data=train_data,
              ax=ax[0]).set_title('Loan defaults by whether customer has other loans');
sns.countplot(x='business_type', hue='default_status', data=train_data,
              ax=ax[1]).set_title('Loan defaults by business type');
# Side-by-side: defaults split by business age, and by industry.
fig, ax = plt.subplots(1, 2, figsize=(18, 5))
sns.countplot(x='business_new', hue='default_status', data=train_data,
              ax=ax[0]).set_title('Loan defaults by business new or old')
sns.countplot(x='industry', hue='default_status', data=train_data,
              ax=ax[1]).set_title('Loan defaults by industry')
# Industry names are long; rotate the tick labels on the current (right) axes.
plt.xticks(rotation=90);
A few points to note are:
# States that appear in train but never in test: drop those rows so the
# one-hot state columns line up between the two frames.
not_in_test = list(set(train_data.state) - set(test_data.state))
not_in_test
train_data = train_data.loc[~train_data.state.isin(not_in_test)]
# Sanity check: the difference is now empty.
set(train_data.state) - set(test_data.state)
def get_default_percentage(group):
    """Return the share (in percent) of rows in *group* with default_status == 1."""
    defaults = (group.default_status == 1).sum()
    return pd.Series({"default_percentage": defaults * 100. / len(group)})
# Per-state default percentage, reshaped to a plain frame with a 'state' column.
state_df = train_data.groupby("state").apply(get_default_percentage)
state_df["state"] = state_df.index
state_df = state_df.reset_index(drop=True)
state_df.head()
# US map coloured by per-state default percentage.
fig = px.choropleth(
    state_df,
    locations="state",           # two-letter state codes
    color="default_percentage",  # value driving the colour scale
    hover_name="state",
    locationmode='USA-states',
)
fig.update_layout(
    title_text='State by loan default percentage',
    geo_scope='usa',  # restrict the map to the USA
)
fig.show()
# insured_amount is highly correlated with loan_amount; keep only the derived
# difference and drop the redundant/unusable columns from both frames.
train_data[["loan_amount", "insured_amount", "insured_minus_loan", "default_status"]].corr()
columns_to_drop = ["location", "request_date", "insured_amount"]
train_data.drop(columns_to_drop, axis=1, inplace=True)
test_data.drop(columns_to_drop, axis=1, inplace=True)
# Columns to one-hot encode: every non-numeric column. The id and
# default_status columns are integer-typed and are therefore excluded by the
# dtype check itself. (The original loop additionally filtered on 'user_id'
# and 'is_unengaged' — column names copy-pasted from a different project that
# do not exist in this dataset — which was dead code.)
cat_columns = [
    col for col in train_data.columns
    if not pd.api.types.is_numeric_dtype(train_data[col])
]
cat_columns
# NOTE(review): encoding train and test separately can misalign columns if a
# category is missing from one split; the equality check before prediction
# guards against this.
train_data = pd.get_dummies(train_data, columns=cat_columns)
test_data = pd.get_dummies(test_data, columns=cat_columns)
train_data.shape, test_data.shape
Stratify using the dependent variable default_status
# Hold out 20% for validation, preserving the class ratio via stratification.
train, valid = train_test_split(train_data, test_size=0.2, random_state=42,
                                stratify=train_data.default_status)
print(train.shape, valid.shape)
# Peel off the label and id, keep only features, in a fixed column order.
default_status_train = train.default_status
id_train = train.id
feature_columns = train.columns.difference(['id', 'default_status'])
train = train[feature_columns]
train = train.reindex(sorted(train.columns), axis=1)
X_train = np.array(train)
y_train = np.array(default_status_train.astype('category'))
print(X_train.shape, y_train.shape)
# Mirror the training-set preparation for the validation split.
default_status_valid = valid.default_status
id_valid = valid.id
valid = valid[valid.columns.difference(['id', 'default_status'])]
valid = valid.reindex(sorted(valid.columns), axis=1)
X_valid = np.array(valid)
y_valid = np.array(default_status_valid.astype('category'))
print(X_valid.shape, y_valid.shape)
Use grid-search cross-validation for determining the best model parameters.
# First sweep: tree depth x number of trees, everything else held fixed.
max_depth = [4, 6, 8]
n_estimators = [60, 70, 80, 90, 100]
params = dict(
    scale_pos_weight=[1.],
    max_depth=max_depth,
    subsample=[.8],
    colsample_bytree=[.8],
    n_estimators=n_estimators,
    learning_rate=[0.05],
    nthread=[5],
    tree_method=['hist'],
)
xgb_classifier = xgb.XGBClassifier()
rs = GridSearchCV(xgb_classifier, params, cv=3,
                  n_jobs=1, scoring='f1', verbose=1)
grid_result = rs.fit(X_train, y_train)
gbm = rs.best_estimator_
print(f"best score GBM: {rs.best_score_}\n"
      f"best params GBM: {rs.best_params_}")
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = rs.cv_results_['mean_test_score']
stds = rs.cv_results_['std_test_score']
params = rs.cv_results_['params']
# One curve per tried depth: mean CV F1 versus number of estimators.
# GridSearchCV enumerates the grid in sorted-key order, so the flat score
# list reshapes to (len(max_depth), len(n_estimators)).
scores = np.array(means).reshape(len(max_depth), len(n_estimators))
f = plt.figure(figsize=(10, 5))
for depth_idx, depth in enumerate(max_depth):
    plt.plot(n_estimators, scores[depth_idx],
             label='depth: ' + str(depth), linewidth=3)
plt.legend()
plt.xlabel('n_estimators')
plt.ylabel('F1 score');
The scale_pos_weight parameter captures the slight class imbalance in the data by weighting errors on misclassified loan defaults roughly twice as heavily as errors on misclassified non-defaults.
Choose max_depth as 6 and n_estimators as 80 based on the plot above.
# Negative-to-positive ratio: N/pos - 1 == (N - pos)/pos, which is XGBoost's
# recommended setting for scale_pos_weight on imbalanced data.
num_positive = np.sum(y_train == 1)
scale_pos_weight = y_train.shape[0] / num_positive - 1.0
scale_pos_weight
# Second sweep: fix depth/trees at the chosen values and tune the
# class-imbalance weight and the minimum-split-loss regulariser gamma.
params = dict(
    scale_pos_weight=[1., scale_pos_weight],
    gamma=[1., 2., 0.],
    max_depth=[6],
    subsample=[.8],
    colsample_bytree=[.8],
    n_estimators=[80],
    learning_rate=[0.05],
    nthread=[5],
    tree_method=['hist'],
)
xgb_classifier = xgb.XGBClassifier()
rs = GridSearchCV(xgb_classifier, params, cv=3,
                  n_jobs=1, scoring='f1', verbose=1)
grid_result = rs.fit(X_train, y_train)
gbm = rs.best_estimator_
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
The metrics captured on the validation set are:
"""
Compute binary classification metrics
"""
def compute_validation_acc(model, features_test, target):
predict_proba = model.predict_proba(features_test)
predicted = model.predict(features_test)
pred_prob = np.array(predict_proba[:, 1])
area_under_curve = roc_auc_score(target, pred_prob)
labels = ['non default', 'defaults']
print("accuracy score:", accuracy_score(
target, model.predict(features_test)))
print(target.shape, np.sum(predicted))
cm = confusion_matrix(target, predicted)
print(cm)
sns.set(rc={'figure.figsize': (18, 5)})
fig, ax = plt.subplots(1, 3)
fig.suptitle("Metrics for binary classification", fontsize=16)
# confusion matrix
akws = {"ha": 'center', "va": 'center', "size": 17}
# annot=True to annotate cells
sns.heatmap(cm, annot=True, ax=ax[0], annot_kws=akws)
ax[0].set_xlabel('Predicted labels')
ax[0].set_ylabel('True labels')
ax[0].set_title('Confusion Matrix')
ax[0].xaxis.set_ticklabels(labels)
ax[0].yaxis.set_ticklabels(labels)
# ROC
fpr, tpr, _ = metrics.roc_curve(target, pred_prob)
ax[1].plot(fpr, tpr, label=f"auc = {area_under_curve:.3f}")
ax[1].set_xlabel('True positive rate')
ax[1].set_title('ROC curve')
ax[1].set_ylabel('False positive rate')
ax[1].legend(loc="lower right")
# precision recall
precision, recall, thresholds = precision_recall_curve(target, pred_prob)
f1 = f1_score(target, predicted)
auc_pr = auc(recall, precision)
ap = average_precision_score(target, pred_prob)
print('f1=%.3f auc_pr=%.3f avg_pr=%.3f auc=%.3f' %
(f1, auc_pr, ap, area_under_curve))
ax[2].plot(precision, recall, label=f"auc_pr = {auc_pr:.3f}")
ax[2].set_title('PR Curve')
ax[2].set_xlabel('Recall')
ax[2].set_ylabel('Precision')
ax[2].legend(loc="upper right")
compute_validation_acc(gbm, X_valid, y_valid.astype(int))
# Map feature names to the fitted model's importance scores, descending.
importances = gbm.feature_importances_
descending = np.argsort(importances)[::-1]
feature_importance = {train.columns[i]: importances[i] for i in descending}
n_features = X_train.shape[1]
top_names = sorted(feature_importance, key=feature_importance.get, reverse=True)
f = plt.figure(figsize=(22, 6))
plt.grid(True)
plt.yticks(fontsize=18)
plt.xticks(range(n_features), top_names, rotation=90, fontsize=18)
plt.title("Feature importances", fontsize=22)
plt.bar(range(n_features), [feature_importance[name] for name in top_names],
        color="#ff471a", align="center")
plt.xlim(-1, n_features)
plt.ylabel('Relative feature importance', fontsize=20);
Generate the submissions_abhinav.csv file as well.
# Build the submission from the test set, using the same column ordering as
# the training matrix so the model sees features in the expected positions.
id_test = test_data.id
test_data = test_data.drop(['id'], axis=1)
test_data = test_data.reindex(sorted(test_data.columns), axis=1)
X_test = np.array(test_data)
print(X_test.shape)
if list(train.columns) == list(test_data.columns):
    predicted = gbm.predict(X_test)
    submission = pd.DataFrame(
        {"id": list(id_test), "default_status": list(predicted)})
    print(submission.default_status.value_counts())
    submission.to_csv("submissions_abhinav.csv", index=False)
else:
    # The original silently skipped prediction on a mismatch, leaving
    # `submission` undefined; fail loudly so the problem cannot be missed.
    raise ValueError("train/test feature columns do not match; "
                     "cannot generate predictions")
submission.head()